In [1]:
import pandas as pd
# Load the dataset
df = pd.read_csv('AI in Healthcare Ethiopia (Responses) - Form Responses 1.csv')
# Quick look at the first few rows
df.head()
Out[1]:
| Timestamp | Age | Sex | Marital Status | Which Region are currently working in? | Level of Education | Monthly Income (in Birr) | Professional Role | How many years of experience do you have in your fied? | Artificial Intelligence (AI) is like a smart software or a tool that can do tasks like a human. | ... | AI might replace human doctors in the future. | Benefits outweigh risks; improvements are worth the risks. | Doctors and healthcare students should learn about AI. | AI in healthcare needs regulation and responsible use. | More research is needed to understand AI’s impact. | I use AI tools in my healthcare profession only if they are validated | I’m confident and comfortable using AI tools. | I verify AI outputs before implementing them. | I trust AI results and recommend them to colleagues | I keep up with AI developments in healthcare. | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3/31/2024 10:52:36 | 32 | Female | Single | Addis Ababa | Bachelor Degree | 10000 | Physician / Doctor / Surgeon | 0 | TRUE | ... | Disagree | Agree | Strongly Agree | Strongly Agree | Strongly Agree | Never | Never | Never | Never | Always |
| 1 | 3/31/2024 11:18:57 | 26 | Male | Single | Addis Ababa | Bachelor Degree | 200 | Physician / Doctor / Surgeon | 2 | TRUE | ... | Disagree | Agree | Agree | Strongly Agree | Strongly Agree | Always | Always | Always | Always | Always |
| 2 | 3/31/2024 11:30:48 | 30 | Male | Married | Gambella | Bachelor Degree | 8018 | Public Health Officer | 5 | TRUE | ... | Strongly Agree | Agree | Agree | Agree | Strongly Agree | Always | Always | Always | Always | Always |
| 3 | 3/31/2024 12:09:09 | 26 | Male | Single | SNNPR | Postgraduate Degree (Master’s, Professional, o... | 8000 | Public Health Officer | 3 | TRUE | ... | Disagree | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Always | Always | Always | Always | Always |
| 4 | 3/31/2024 12:12:13 | 24 | Female | Single | Amhara | Postgraduate Degree (Master’s, Professional, o... | 9000 | Biostatistician | 1 | TRUE | ... | Neutral | Strongly Agree | Strongly Agree | Strongly Agree | Strongly Agree | Always | Always | Never | Never | Always |
5 rows × 30 columns
In [3]:
# Define a dictionary of {Old Name : New Name}
new_names = {
'Marital Status': 'Marital_Status',
'Which Region are currently working in?': 'Region',
'Level of Education': 'Education_Level',
'Monthly Income (in Birr)': 'Income',
'Professional Role': 'Role',
'How many years of experience do you have in your fied?': 'Experience_Years',
# Knowledge Questions
'Artificial Intelligence (AI) is like a smart software or a tool that can do tasks like a human.': 'Knw_AI_Definition',
'AI learns from data it’s trained with and can understand everyday language to handle complex tasks.': 'Knw_AI_Learning',
'AI assists healthcare professionals by enhancing diagnosis, treatment, research, education, and management.': 'Knw_AI_Assist',
'AI helps diagnose diseases by analyzing images, symptoms, test results, and other data.': 'Knw_AI_Diagnosis',
'AI suggests optimal medicines and dosages for treatment.': 'Knw_AI_Treatment',
'AI is used in radiology, pathology, surgery, pharmacy, research, and public health.': 'Knw_AI_Usage',
'Limitations include data quality, privacy, security, bias, errors, and ethical issues.': 'Knw_AI_Limitations',
'Ethical concerns involve informed consent, accountability, responsibility, and transparency.': 'Knw_AI_Ethics',
# Attitude Questions
'AI can solve complex problems and improve health outcomes.': 'Att_Solve_Problems',
'AI enhances accessibility, especially for remote areas.': 'Att_Accessibility',
'AI reduces workload for healthcare professionals and optimizes resources.': 'Att_Workload',
'AI might replace human doctors in the future.': 'Att_Replace_Doctors',
'Benefits outweigh risks; improvements are worth the risks.': 'Att_Benefits_Risks',
'Doctors and healthcare students should learn about AI.': 'Att_Learn_AI',
'AI in healthcare needs regulation and responsible use.': 'Att_Regulation',
'More research is needed to understand AI’s impact.': 'Att_Research_Need',
# Practice Questions
'I use AI tools in my healthcare profession only if they are validated': 'Prac_Use_Validated',
'I’m confident and comfortable using AI tools.': 'Prac_Confidence',
'I verify AI outputs before implementing them.': 'Prac_Verify_Output',
'I trust AI results and recommend them to colleagues': 'Prac_Trust_Recommend',
'I keep up with AI developments in healthcare.': 'Prac_Keep_Updated'
}
# Rename the columns
df = df.rename(columns=new_names)
In [4]:
def clean_role(text):
if not isinstance(text, str):
return "Other"
text = text.lower().strip()
# Grouping Logic
if 'nurse' in text or 'midwife' in text:
return 'Nurse/Midwife'
elif 'doctor' in text or 'physician' in text or 'surgeon' in text or 'gp' in text or 'intern' in text:
return 'Medical Doctor'
elif 'public health' in text or 'epidemiologist' in text or 'environment' in text:
return 'Public Health'
elif 'pharm' in text:
return 'Pharmacist'
elif 'lab' in text or 'microbiolog' in text or 'biomed' in text:
return 'Lab/Biomedical'
elif 'lecturer' in text or 'teacher' in text or 'lecurer' in text: # Fix typo 'lecurer'
return 'Academic/Lecturer'
elif 'data' in text or 'statistic' in text or 'informatic' in text:
return 'Data/IT/Stats'
elif 'anesthe' in text or 'ansthetist' in text: # Fix typo 'ansthetist'
return 'Anesthetist'
elif 'student' in text:
return 'Student'
else:
return 'Other'
# Apply the function
df['Role'] = df['Role'].apply(clean_role)
In [5]:
def clean_education(text):
if not isinstance(text, str):
return "Unknown"
if 'Bachelor' in text:
return 'Bachelor'
elif 'Master' in text or 'Postgraduate' in text:
return 'Masters/Postgrad'
elif 'Doctoral' in text or 'PhD' in text:
return 'PhD'
else:
return 'Other'
df['Education_Level'] = df['Education_Level'].apply(clean_education)
In [6]:
def clean_marital(text):
if not isinstance(text, str):
return "Unknown"
return text.split(' / ')[0] # Simplifies "Divorced / Widowed" to just "Divorced" or similar if needed
df['Marital_Status'] = df['Marital_Status'].apply(clean_marital)
In [7]:
import pandas as pd
# 1. Define the cleaning function
def clean_region_v2(text):
if not isinstance(text, str):
return "Unknown"
# Convert to lowercase and strip spaces for easier matching
t = text.lower().strip()
# --- REGION MATCHING LOGIC ---
# 1. Addis Ababa (Handles: "A.A", "Addis", "A̲. A̲", "Finfinne")
if any(x in t for x in ['addis', 'adis', 'aa', 'a.a', 'finfinne', 'a̲']):
return 'Addis Ababa'
# 2. Oromia (Handles: "Jimma", "Shoa", "Hareghe", "Meta welabu", "Limmu")
if any(x in t for x in ['oromia', 'oromiya', 'jimma', 'shoa', 'hareghe', 'meta welabu', 'limmu']):
return 'Oromia'
# 3. Amhara
if 'amhara' in t:
return 'Amhara'
# 4. Sidama (Handles: "Sidaamu", "Sidam")
if any(x in t for x in ['sidama', 'sidaamu', 'sidam']):
return 'Sidama'
# 5. Southern / Central / South West
# A. South West Ethiopia
# (Must check this BEFORE "South Ethiopia" to catch 'South West' correctly)
if any(x in t for x in ['swe', 'south west']):
return 'South West Ethiopia'
# B. Central Ethiopia
if any(x in t for x in ['central', 'centeral', 'gurage', 'werabe']):
return 'Central Ethiopia'
# C. South Ethiopia
if any(x in t for x in ['snnpr', 'snne', 'southern', 'south', 'debub', 'amu']):
return 'South Ethiopia'
# 6. Somali (Handles: "Jijiga")
if any(x in t for x in ['somali', 'somale', 'jijiga']):
return 'Somali'
# 7. Afar
if 'afar' in t:
return 'Afar'
# 8. Harari
if 'harar' in t:
return 'Harari'
# 9. Dire Dawa
if 'dire' in t:
return 'Dire Dawa'
# 10. Gambella
if 'gambella' in t:
return 'Gambella'
# 11. Benishangul Gumuz
if 'benishangul' in t or 'beneshagul' in t or 'gumuz' in t:
return 'Benishangul Gumuz'
# 12. Tigray
if 'tigray' in t:
return 'Tigray'
# --- EXCLUSION LOGIC ---
# If it didn't match a region above, check if it's explicitly invalid data
if any(x in t for x in ['student', 'job', 'unemployed', 'orthodox', 'muslim',
'protestant', 'evangelical', 'islam', 'sudan', 'afghanistan',
'04', 'east africa', 'graduate', 'nursing']):
return 'Other/Invalid'
# Default for anything else not caught
return 'Other/Invalid'
# 2. Apply the function
df['Region'] = df['Region'].apply(clean_region_v2)
# 3. Verify the results
print(df['Region'].value_counts())
Region Addis Ababa 166 Oromia 113 Other/Invalid 28 Sidama 27 Amhara 24 Central Ethiopia 23 South Ethiopia 18 Somali 10 South West Ethiopia 8 Harari 8 Afar 8 Tigray 6 Dire Dawa 4 Gambella 3 Benishangul Gumuz 3 Name: count, dtype: int64
In [8]:
# Convert all roles to Title Case (e.g., "public health" -> "Public Health")
df['Role'] = df['Role'].str.title()
In [9]:
# Convert 'Region' to Title Case (e.g., "addis ababa" -> "Addis Ababa")
df['Region'] = df['Region'].str.title()
In [10]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
In [11]:
df.to_csv('AI_Healthcare_Cleaned.csv', index=False)
In [19]:
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
# 1. Load Your Data
df = pd.read_csv('AI_Healthcare_Cleaned.csv')
# 2. Calculate "Adoption Rate" by Region
# We use .agg() to avoid the warning and calculate the % of High Usage
region_stats = df.groupby('Region').agg(
Adoption_Rate=('Prac_Use_Validated', lambda x: x.isin(['Often', 'Always']).mean() * 100),
Respondent_Count=('Region', 'count')
).reset_index()
# 3. Load the Shapefile (The Map Borders)
# We specify 'layer="eth_admin1"' to get Regions, not the whole country
shapefile_path = "eth_admin_boundaries.shp"
gdf = gpd.read_file(shapefile_path, layer="eth_admin1")
# 4. Fix Spelling Mismatches (Crucial Step!)
# This maps your Survey Names (Left) to the Map File Names (Right)
name_fix = {
'Benishangul Gumuz': 'Benishangul Gumz',
'Gambella': 'Gambela',
'Central Ethiopia': 'SNNP', # Mapping new regions to the old SNNP shape
'South Ethiopia': 'SNNP',
'South West Ethiopia': 'SNNP' # Often part of SNNP in older maps, or check if map has it
}
# Apply the fix
region_stats['Region_Map_Name'] = region_stats['Region'].replace(name_fix)
# 5. Merge Data
# We group by the NEW map name to handle the SNNP merge (combining South + Central)
final_stats = region_stats.groupby('Region_Map_Name')['Adoption_Rate'].mean().reset_index()
# Join with the shapefile
merged = gdf.set_index('adm1_name').join(final_stats.set_index('Region_Map_Name'))
# 6. Plot the Map
fig, ax = plt.subplots(1, 1, figsize=(12, 12))
merged.plot(column='Adoption_Rate',
cmap='OrRd', # Orange to Red color scheme
linewidth=0.8,
ax=ax,
edgecolor='0.8',
legend=True,
legend_kwds={'label': "AI Adoption Rate (%)", 'orientation': "horizontal"},
missing_kwds={'color': 'lightgrey', 'label': 'No Data'})
ax.axis('off')
ax.set_title('AI Adoption Intensity by Region in Ethiopia', fontdict={'fontsize': '15', 'fontweight' : '3'})
plt.show()
In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Set visual style for professional reports
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)
# Load the data (Make sure the file is in the same folder)
df = pd.read_csv('AI_Healthcare_Cleaned.csv')
# Quick check
print(f"Total Respondents: {len(df)}")
df.head(3)
Total Respondents: 449
Out[20]:
| Timestamp | Age | Sex | Marital_Status | Region | Education_Level | Income | Role | Experience_Years | Knw_AI_Definition | ... | Att_Replace_Doctors | Att_Benefits_Risks | Att_Learn_AI | Att_Regulation | Att_Research_Need | Prac_Use_Validated | Prac_Confidence | Prac_Verify_Output | Prac_Trust_Recommend | Prac_Keep_Updated | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2024-03-31 10:52:36 | 32 | Female | Single | Addis Ababa | Bachelor | 10000 | Medical Doctor | 0 | TRUE | ... | Disagree | Agree | Strongly Agree | Strongly Agree | Strongly Agree | Never | Never | Never | Never | Always |
| 1 | 2024-03-31 11:18:57 | 26 | Male | Single | Addis Ababa | Bachelor | 200 | Medical Doctor | 2 | TRUE | ... | Disagree | Agree | Agree | Strongly Agree | Strongly Agree | Always | Always | Always | Always | Always |
| 2 | 2024-03-31 11:30:48 | 30 | Male | Married | Gambella | Bachelor | 8018 | Public Health | 5 | TRUE | ... | Strongly Agree | Agree | Agree | Agree | Strongly Agree | Always | Always | Always | Always | Always |
3 rows × 30 columns
In [23]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Create a dashboard for demographics
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
# 1. Professional Roles (Top 5)
top_roles = df['Role'].value_counts().nlargest(5)
# --- THE FIX IS HERE ---
# We added 'hue=top_roles.index' and 'legend=False'
sns.barplot(
x=top_roles.values,
y=top_roles.index,
hue=top_roles.index, # Explicitly tell it to color based on the Role name
legend=False, # Hide the legend (redundant since y-axis has labels)
ax=axes[0],
palette='viridis'
)
axes[0].set_title('Top 5 Professional Roles')
axes[0].set_xlabel('Number of Respondents')
# 2. Experience Distribution
# Ensure experience is numeric
df['Experience_Years'] = pd.to_numeric(df['Experience_Years'], errors='coerce')
sns.histplot(df['Experience_Years'].dropna(), bins=15, kde=True, ax=axes[1], color='#3498db')
axes[1].set_title('Years of Experience Distribution')
axes[1].set_xlabel('Years')
plt.tight_layout()
plt.show()
In [24]:
# Cell 3: Attitude Analysis (The "Fear" vs. "Optimism" Gap)
# Define the order for Likert scales so charts make sense
likert_order = ['Strongly Disagree', 'Disagree', 'Neutral', 'Agree', 'Strongly Agree']
# Create a comparison plot
fig, ax = plt.subplots(1, 2, figsize=(16, 6))
# Plot 1: Will AI replace doctors?
sns.countplot(
x='Att_Replace_Doctors',
hue='Att_Replace_Doctors', # FIX 1: Explicitly link color to the x-variable
data=df,
order=likert_order,
ax=ax[0],
palette='RdYlBu',
legend=False # FIX 2: Turn off the legend since we have labels
)
ax[0].set_title('Statement: "AI will replace human doctors"')
ax[0].tick_params(axis='x', labelrotation=45) # FIX 3: The modern way to rotate labels
# Plot 2: Do benefits outweigh risks?
sns.countplot(
x='Att_Benefits_Risks',
hue='Att_Benefits_Risks', # FIX 1
data=df,
order=likert_order,
ax=ax[1],
palette='RdYlGn',
legend=False # FIX 2
)
ax[1].set_title('Statement: "Benefits outweigh the risks"')
ax[1].tick_params(axis='x', labelrotation=45) # FIX 3
plt.tight_layout()
plt.show()
# Calculate exact percentage for your report text
fear_pct = df['Att_Replace_Doctors'].isin(['Agree', 'Strongly Agree']).mean() * 100
optimism_pct = df['Att_Benefits_Risks'].isin(['Agree', 'Strongly Agree']).mean() * 100
print(f"Fear Factor: {fear_pct:.1f}% believe they might be replaced.")
print(f"Optimism Factor: {optimism_pct:.1f}% believe benefits outweigh risks.")
Fear Factor: 33.0% believe they might be replaced. Optimism Factor: 56.3% believe benefits outweigh risks.
In [26]:
# Cell 4: Practice Analysis (Actual Usage)
# Analyze usage frequency
usage_order = ['Never', 'Rarely', 'Sometimes', 'Often', 'Always']
plt.figure(figsize=(10, 6))
# --- THE FIX IS HERE ---
ax = sns.countplot(
y='Prac_Use_Validated',
hue='Prac_Use_Validated', # Explicitly tell it to color based on usage frequency
data=df,
order=usage_order,
palette='magma',
legend=False # Hide the legend
)
plt.title('How often do professionals use validated AI tools?')
plt.xlabel('Count')
# Add percentages on the bars
total = len(df)
for p in ax.patches:
# Only label bars that actually exist
if p.get_width() > 0:
percentage = '{:.1f}%'.format(100 * p.get_width()/total)
x = p.get_x() + p.get_width() + 3
y = p.get_y() + p.get_height()/2
ax.annotate(percentage, (x, y), va='center')
plt.show()
In [27]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
# Load Data
df = pd.read_csv('AI_Healthcare_Cleaned.csv')
# Define a "Modern" Color Palette (Teal, Slate, Coral - very clean)
colors_modern = ['#264653', '#2a9d8f', '#e9c46a', '#f4a261', '#e76f51']
In [28]:
# Create a hierarchy: Region -> Sex -> Usage
fig_sun = px.sunburst(df,
path=['Region', 'Sex', 'Prac_Use_Validated'],
title='<b>Digital Equity Map:</b> AI Adoption by Region & Gender',
color_discrete_sequence=px.colors.qualitative.Pastel)
fig_sun.update_layout(margin=dict(t=40, l=0, r=0, b=0), font_family="Arial")
fig_sun.show()
In [29]:
# Prepare Data: Group by Role to see market segments
# Calculate average Knowledge Score (0-8) and Trust %
# Convert Trust/Knowledge to numeric for plotting
knw_cols = [c for c in df.columns if c.startswith('Knw_')]
df['Knw_Score'] = df[knw_cols].astype(str).apply(lambda x: x.str.contains('TRUE|True', case=False)).sum(axis=1)
# Calculate Trust Score (Frequency converted to 1-5 scale for average)
freq_map = {'Never': 1, 'Rarely': 2, 'Sometimes': 3, 'Often': 4, 'Always': 5}
df['Trust_Score'] = df['Prac_Trust_Recommend'].map(freq_map)
# Group by Role
role_data = df.groupby('Role').agg(
Avg_Knowledge=('Knw_Score', 'mean'),
Avg_Trust=('Trust_Score', 'mean'),
Count=('Role', 'count')
).reset_index()
# Filter for Roles with at least 5 people (to remove noise)
role_data = role_data[role_data['Count'] > 5]
fig_bubble = px.scatter(role_data, x="Avg_Knowledge", y="Avg_Trust",
size="Count", color="Role",
hover_name="Role",
title="<b>The Market Matrix:</b> Knowledge vs. Trust by Profession",
labels={"Avg_Knowledge": "AI Literacy (Score 0-8)", "Avg_Trust": "Trust Level (1-5)"},
size_max=60)
# Add a "Sweet Spot" box for investors
fig_bubble.add_shape(type="rect", x0=6, y0=3.5, x1=8, y1=5,
line=dict(color="Green", width=2, dash="dot"),
)
fig_bubble.add_annotation(x=7, y=4.8, text="Ideally Positioned<br>Market", showarrow=False, font=dict(color="green"))
fig_bubble.update_layout(template="plotly_white")
fig_bubble.show()
In [30]:
# Calculate percentages
reg_agree = df['Att_Regulation'].isin(['Agree', 'Strongly Agree']).mean() * 100
fear_agree = df['Att_Replace_Doctors'].isin(['Agree', 'Strongly Agree']).mean() * 100
risk_agree = df['Att_Benefits_Risks'].isin(['Agree', 'Strongly Agree']).mean() * 100
# Data for Chart
categories = ['Demand for Regulation', 'Belief Benefits > Risks', 'Fear of Job Replacement']
values = [reg_agree, risk_agree, fear_agree]
colors = ['#2a9d8f', '#2a9d8f', '#e76f51'] # Green for good, Red for fear
fig_bar = go.Figure(go.Bar(
x=values,
y=categories,
orientation='h',
marker_color=colors,
text=[f"{v:.1f}%" for v in values],
textposition='auto'
))
fig_bar.update_layout(title_text='<b>Policy Pulse Check:</b> The Regulation vs. Fear Gap',
xaxis_title="Percentage of Workforce Agreeing",
template="plotly_white")
fig_bar.show()
In [31]:
import plotly.graph_objects as go
# Data Prep: Pivot Gender vs Role for 'Always' usage
heatmap_data = df[df['Prac_Use_Validated'] == 'Always'].groupby(['Role', 'Sex']).size().unstack(fill_value=0)
fig_heat = go.Figure(data=go.Heatmap(
z=heatmap_data.values,
x=heatmap_data.columns,
y=heatmap_data.index,
colorscale='Teal',
hoverongaps = False))
fig_heat.update_layout(title='<b>Gender Equity Heatmap:</b> Who are the Power Users?',
template='plotly_white')
fig_heat.show()
In [32]:
# Create Income Groups for clearer visualization
df['Income_Group'] = pd.qcut(pd.to_numeric(df['Income'], errors='coerce'), 4, labels=['Low', 'Medium', 'High', 'Elite'])
fig_violin = px.violin(df, y="Att_Replace_Doctors", x="Income_Group", color="Income_Group",
box=True, points="all",
title='<b>The Anxiety Curve:</b> Job Security Fear by Income Level',
category_orders={"Att_Replace_Doctors": ["Strongly Disagree", "Disagree", "Neutral", "Agree", "Strongly Agree"]},
color_discrete_sequence=px.colors.qualitative.Bold)
fig_violin.update_layout(yaxis_title="Fear Level", showlegend=False)
fig_violin.show()
In [33]:
# Prep Data: Calculate average scores (0-100%) for 4 key metrics
def get_metrics(sub_df):
return [
sub_df['Knw_AI_Definition'].apply(lambda x: 1 if str(x).upper()=='TRUE' else 0).mean(),
sub_df['Prac_Use_Validated'].apply(lambda x: 1 if x in ['Often', 'Always'] else 0).mean(),
sub_df['Att_Replace_Doctors'].isin(['Agree', 'Strongly Agree']).mean(),
sub_df['Att_Regulation'].isin(['Agree', 'Strongly Agree']).mean()
]
metrics_addis = get_metrics(df[df['Region'] == 'Addis Ababa'])
metrics_regions = get_metrics(df[df['Region'] != 'Addis Ababa'])
categories = ['Knowledge', 'High Usage', 'Fear of Job Loss', 'Demand for Regulation']
fig_radar = go.Figure()
fig_radar.add_trace(go.Scatterpolar(
r=metrics_addis,
theta=categories,
fill='toself',
name='Addis Ababa (Capital)'
))
fig_radar.add_trace(go.Scatterpolar(
r=metrics_regions,
theta=categories,
fill='toself',
name='Regional Ethiopia'
))
fig_radar.update_layout(
polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
title='<b>The Digital Divide Radar:</b> Capital vs. Regions',
showlegend=True
)
fig_radar.show()
In [ ]: